# Statements Analysis Script Runner
# Alex F. Gazmararian
# agazmararian@gmail.com
#
# This script runs all necessary scripts to produce the statements analysis output
# Run this script from the project root directory

library(here)

# Load all project functions from R/ modules
source(here("R", "load_functions.R"))

# Configuration ----
message("=== STATEMENTS ANALYSIS SCRIPT RUNNER ===")
start_time <- Sys.time()

# Configuration options
SKIP_RENV_CHECK <- getOption("statements.skip_renv_check", FALSE)
SKIP_ROBUSTNESS_CHECKS <- getOption("statements.skip_robustness", FALSE)

# PNAS mode: when TRUE (default), only outputs for the paper are generated
# When FALSE, additional diagnostic/internal outputs are also generated
pnas <- getOption("pnas.mode", TRUE)

# Environment Validation ----
message("=== ENVIRONMENT VALIDATION ===")

# Check R version
r_version <- getRversion()
min_r_version <- "4.3.0"
if (r_version < min_r_version) {
  stop(sprintf("R version %s or higher required. Current version: %s", min_r_version, r_version))
}
message(sprintf("[OK] R version: %s", r_version))

# Check renv status and provide user guidance
if (!SKIP_RENV_CHECK && "renv" %in% installed.packages()[,"Package"]) {
  message("Checking renv package synchronization...")
  
  # Check if we're in an renv project
  tryCatch({
    if (file.exists("renv.lock")) {
      # Get renv status
      status_result <- capture.output(suppressMessages(renv::status()), type = "message")
      
      # Check if packages are out of sync
      if (any(grepl("inconsistent|not installed|out of sync", status_result, ignore.case = TRUE))) {
        message("")
        message("[ERROR] ENVIRONMENT SYNCHRONIZATION REQUIRED")
        message("Your installed packages don't match the renv.lock file.")
        message("")
        message("This analysis requires specific package versions for reproducibility.")
        message("Please run ONE of the following commands to synchronize:")
        message("")
        message("Option 1 (Recommended): Restore exact versions from lockfile")
        message("  renv::restore()")
        message("")
        
        message("Option 2: Skip renv check and run with current packages (may cause errors)")
        message("  options(statements.skip_renv_check = TRUE)")
        message("  source('analysis/statements/run_statements_analysis.R')")
        message("")
        
        message("Option 3: Update lockfile to match your current packages")
        message("  renv::snapshot()")
        message("")
        
        stop("Environment synchronization required. Please run renv::restore() or set skip option.")
      } else {
        message("[OK] renv packages are synchronized with lockfile")
      }
    } else {
      message("No renv.lock file found - skipping renv sync check")
    }
  }, error = function(e) {
    message(sprintf("[WARN] Could not check renv status: %s", e$message))
    message("You may need to run renv::restore() to ensure package compatibility")
  })
} else if (SKIP_RENV_CHECK) {
  message("renv check skipped (disabled by configuration)")
} else {
  message("renv not available - skipping package synchronization")
}

# Check for essential directory structure
essential_dirs <- c(
  here("data", "input"),
  here("data", "inter"),
  here("data", "output")
)

missing_dirs <- essential_dirs[!sapply(essential_dirs, dir.exists)]
if (length(missing_dirs) > 0) {
  message("[INFO] Creating missing directories:")
  for (dir in missing_dirs) {
    dir.create(dir, recursive = TRUE, showWarnings = FALSE)
    message(sprintf("  [OK] Created: %s", dir))
  }
}
message("[OK] Essential data directory structure ready")

# System information for reproducibility
message("")
message("=== SYSTEM INFORMATION ===")
si <- Sys.info()
message(sprintf("Operating System: %s %s", si[["sysname"]], si[["release"]]))
message(sprintf("R Version: %s", R.version.string))
message(sprintf("Platform: %s", R.version$platform))

# Memory information
if (si[["sysname"]] != "Windows") {
  tryCatch({
    # Get memory info on Unix-like systems
    mem_info <- system("free -h 2>/dev/null || vm_stat 2>/dev/null || echo 'Memory info unavailable'", intern = TRUE)
    if (length(mem_info) > 1 && !grepl("unavailable", mem_info[1])) {
      message("[OK] Memory: Available")
    }
  }, error = function(e) {
    # Silent fallback
  })
}

message("")
message("Environment validation complete")
message("")

# Ensure output directories exist
output_dirs_to_create <- c(
  here("output", "pnas", "figures"),
  here("output", "pnas", "tables"),
  here("output", "pnas", "stats"),
  here("output", "pnas", "log")
)

for (dir_path in output_dirs_to_create) {
  if (!dir.exists(dir_path)) {
    dir.create(dir_path, recursive = TRUE, showWarnings = FALSE)
    message(sprintf("[OK] Created directory: %s", dir_path))
  }
}

# Clean intermediate data directory for fresh run
inter_dir <- here("data", "inter")
if (dir.exists(inter_dir)) {
  inter_files <- list.files(inter_dir, full.names = TRUE, recursive = TRUE)
  if (length(inter_files) > 0) {
    message(sprintf("Cleaning %d files from intermediate data directory...", length(inter_files)))
    file.remove(inter_files)
    message("Intermediate data directory cleaned for fresh run")
  } else {
    message("Intermediate data directory already empty")
  }
}

# Set up logging ----
log_dir <- here("output", "pnas", "log")
if (!dir.exists(log_dir)) {
  dir.create(log_dir, recursive = TRUE, showWarnings = FALSE)
}

# Check if running under unified PNAS pipeline logging
unified_log_file <- getOption("pnas.unified_log_file", NULL)
if (!is.null(unified_log_file)) {
  # Use the parent pipeline's log file
  log_file <- unified_log_file
} else {
  # Create own log file with timestamp
  log_timestamp <- format(start_time, "%Y%m%d_%H%M%S")
  log_file <- file.path(log_dir, paste0("statements_analysis_", log_timestamp, ".log"))
}

# Function to write to both console and log file
log_message <- function(type_or_format, msg_or_args = NULL, ...) {
  timestamp <- format(Sys.time(), "%Y-%m-%d %H:%M:%S")
  
  # Very defensive function to handle all cases
  tryCatch({
    # Check if this is the old pattern: log_message("INFO", "message")
    if (!is.null(msg_or_args) && length(list(...)) == 0 && 
        type_or_format %in% c("INFO", "WARN", "ERROR", "DEBUG")) {
      # Pattern 1: log_message(type, msg)
      formatted_msg <- sprintf("%s: %s", type_or_format, msg_or_args)
    } else {
      # Pattern 2: log_message(format, ...) - like the runner scripts
      if (is.null(msg_or_args) && length(list(...)) == 0) {
        # Simple message without formatting
        formatted_msg <- as.character(type_or_format)
      } else {
        # Try to format with sprintf
        args <- c(list(msg_or_args), list(...))
        formatted_msg <- do.call(sprintf, c(list(type_or_format), args))
      }
    }
    
    # Write to console
    message(sprintf("[%s] %s", timestamp, formatted_msg))
    
    # Write to log file
    if (file.exists(dirname(log_file))) {
      cat(sprintf("[%s] %s\n", timestamp, formatted_msg), 
          file = log_file, append = TRUE)
    }
    
  }, error = function(e) {
    # Fallback: just print the original message
    message(sprintf("[%s] Logging error, printing directly: %s", 
                   timestamp, as.character(type_or_format)))
  })
}

# Function to run scripts with error handling and warning capture
run_script <- function(script_path, description) {
  log_message("Running: %s", description)
  log_message("Script: %s", script_path)
  
  start_script_time <- Sys.time()
  warnings_captured <- c()
  
  tryCatch({
    # Check if file exists
    if (!file.exists(script_path)) {
      stop(sprintf("Script not found: %s", script_path))
    }
    
    # Source the script with warning capture
    withCallingHandlers({
      source(script_path, local = FALSE)
    }, warning = function(w) {
      # Capture warning message
      warning_msg <- conditionMessage(w)
      warnings_captured <<- c(warnings_captured, warning_msg)
      log_message("WARNING in %s: %s", description, warning_msg)
      
      # Invoke default warning handler to continue execution
      invokeRestart("muffleWarning")
    })
    
    end_script_time <- Sys.time()
    elapsed <- as.numeric(difftime(end_script_time, start_script_time, units = "secs"))
    
    # Report completion with warning count
    if (length(warnings_captured) > 0) {
      log_message("Completed: %s (%.1f seconds, %d warnings)", 
                 description, elapsed, length(warnings_captured))
    } else {
      log_message("Completed: %s (%.1f seconds)", description, elapsed)
    }
    log_message("")
    
    # Return success with warning info
    return(list(success = TRUE, error = FALSE, warnings = warnings_captured))
    
  }, error = function(e) {
    log_message("ERROR in %s: %s", description, e$message)
    log_message("Script path: %s", script_path)
    log_message("")
    
    # Return error info with any warnings that occurred before the error
    return(list(success = FALSE, error = TRUE, message = e$message, script = script_path, warnings = warnings_captured))
  })
}

# Analysis Pipeline ----
log_message("Starting statements analysis pipeline...")
log_message("Timestamp: %s", format(start_time, "%Y-%m-%d %H:%M:%S"))
log_message("")

# Track results
results <- list()

# Step 1: Data Processing and Annotation
log_message("=== PHASE 1: DATA PROCESSING ===")

# Process raw data
results$process <- run_script(
  script_path = here("R", "annotation", "process.R"),
  description = "Process raw statement data"
)

# Run GPT annotation  
results$annotate <- run_script(
  script_path = here("R", "annotation", "annotate.R"),
  description = "GPT-based statement annotation"
)

# Step 2: Covariate Processing
log_message("=== PHASE 2: COVARIATE PROCESSING ===")

# Process BGM/Turner investment data (creates bgm_processed.rds)
results$bgm <- run_script(
  script_path = here("R", "covariates", "process_bgm.R"),
  description = "Process BGM investment database"
)

# Process politician data (governors, senators, representatives)
results$politicians <- run_script(
  script_path = here("R", "covariates", "get_politicians.R"),
  description = "Process politician party affiliation data"
)

# Process project data (depends on BGM data)
results$turner <- run_script(
  script_path = here("R", "covariates", "process_turner_data.R"),
  description = "Process Turner project investment data"
)

# Process demographic and economic covariates
results$census <- run_script(
  script_path = here("R", "covariates", "census.R"),
  description = "Process Census/ACS demographic data"
)

results$broadband <- run_script(
  script_path = here("R", "covariates", "broadband.R"),
  description = "Process broadband coverage data"
)

results$bea <- run_script(
  script_path = here("R", "covariates", "bea.R"),
  description = "Process BEA economic data"
)

results$unemployment <- run_script(
  script_path = here("R", "covariates", "unemployment.R"),
  description = "Process unemployment data"
)

results$unions <- run_script(
  script_path = here("R", "covariates", "unions.R"),
  description = "Process union membership data"
)

results$electricity <- run_script(
  script_path = here("R", "covariates", "electricity_prices.R"),
  description = "Process electricity price data"
)

# Process geographic covariates  
results$highways <- run_script(
  script_path = here("R", "covariates", "highways.R"),
  description = "Process highway access data"
)

# Process election data
results$pres_2020 <- run_script(
  script_path = here("R", "covariates", "pres_elec_2020.R"),
  description = "Process 2020 presidential election data"
)

results$cong_elec <- run_script(
  script_path = here("R", "covariates", "process_cong_elec.R"),
  description = "Process congressional election data"
)

# Step 3: Data Integration
log_message("=== PHASE 3: DATA INTEGRATION ===")

# Merge with covariates
results$merge <- run_script(
  script_path = here("analysis", "statements", "merge.R"),
  description = "Merge annotations with covariates"
)

# Step 4: Statistical Analysis
log_message("=== PHASE 4: ANALYSIS ===")

# Main analysis
results$analyze <- run_script(
  script_path = here("analysis", "statements", "analyze.R"),
  description = "Statistical analysis and visualization"
)

# Check annotation quality
results$check_annotation_quality <- run_script(
  script_path = here("analysis", "statements", "check_annotation_quality.R"),
  description = "Check annotation quality"
)

# Step 5: Robustness Checks
if (!SKIP_ROBUSTNESS_CHECKS) {
  log_message("=== PHASE 5: ROBUSTNESS CHECKS ===")

  # Alternative annotation robustness check
  results$robustness_annotation <- run_script(
    script_path = here("R", "annotation", "run_robustness_check.R"),
    description = "Alternative codebook annotation robustness check"
  )

  # Merge robustness check annotations with covariates
  results$robustness_merge <- run_script(
    script_path = here("analysis", "statements", "run_merge_robustness.R"),
    description = "Merge robustness check annotations with covariates"
  )

  # Statistical analysis for robustness checks
  results$robustness_analysis <- run_script(
    script_path = here("analysis", "statements", "run_analysis_robustness.R"),
    description = "Statistical analysis for robustness checks"
  )
} else {
  log_message("=== SKIPPING ROBUSTNESS CHECKS ===")
  log_message("Robustness checks skipped per configuration")
  log_message("To enable robustness checks, set: options(statements.skip_robustness = FALSE)")
  log_message("")
}

# Summary ----
end_time <- Sys.time()
total_elapsed <- as.numeric(difftime(end_time, start_time, units = "mins"))

log_message("=== PIPELINE COMPLETE ===")
log_message("Total runtime: %.1f minutes", total_elapsed)
log_message("Start time: %s", format(start_time, "%Y-%m-%d %H:%M:%S"))
log_message("End time: %s", format(end_time, "%Y-%m-%d %H:%M:%S"))

# Final system information summary
log_message("")
log_message("=== REPRODUCIBILITY INFORMATION ===")
log_message("R Version: %s", R.version.string)
log_message("Platform: %s", R.version$platform)
log_message("System: %s %s", Sys.info()[["sysname"]], Sys.info()[["release"]])
log_message("Working Directory: %s", getwd())

# Memory usage at completion
tryCatch({
  if (Sys.info()[["sysname"]] == "Darwin") {
    # macOS memory info
    mem_pressure <- system("memory_pressure 2>/dev/null | head -1", intern = TRUE)
    if (length(mem_pressure) > 0 && !grepl("command not found", mem_pressure)) {
      log_message("Memory Status: %s", mem_pressure)
    }
  } else if (Sys.info()[["sysname"]] == "Linux") {
    # Linux memory info
    mem_info <- system("free -h | head -2", intern = TRUE)
    if (length(mem_info) >= 2) {
      log_message("Memory Usage: %s", gsub("\\s+", " ", mem_info[2]))
    }
  }
}, error = function(e) {
  # Silent fallback - memory info is nice-to-have
})

# R session information
log_message("Locale: %s", Sys.getlocale("LC_CTYPE"))
log_message("Timezone: %s", Sys.timezone())

# Package information for key dependencies
key_packages_info <- c("here", "dplyr", "openai", "modelsummary", "renv")
existing_packages <- key_packages_info[key_packages_info %in% installed.packages()[,"Package"]]
if (length(existing_packages) > 0) {
  log_message("Key package versions:")
  for (pkg in existing_packages) {
    version <- packageVersion(pkg)
    log_message("  %s: %s", pkg, version)
  }
}

# renv status if available
if ("renv" %in% installed.packages()[,"Package"]) {
  tryCatch({
    if (file.exists("renv.lock")) {
      log_message("renv project detected: %s", if(renv::project() == getwd()) getwd() else "renv.lock found")
      
      # Get lockfile hash for reproducibility tracking
      lockfile_path <- file.path(getwd(), "renv.lock")
      if (file.exists(lockfile_path)) {
        lockfile_hash <- tools::md5sum(lockfile_path)
        log_message("renv.lock hash: %s", lockfile_hash)
      }
      
      # Final sync status check
      final_status <- tryCatch({
        status_output <- capture.output(suppressMessages(renv::status()), type = "message")
        if (any(grepl("inconsistent|not installed|out of sync", status_output, ignore.case = TRUE))) {
          "packages out of sync"
        } else {
          "packages synchronized"
        }
      }, error = function(e) {
        "status check failed"
      })
      log_message("renv final status: %s", final_status)
    } else {
      log_message("renv: no lockfile found")
    }
  }, error = function(e) {
    log_message("renv status: error checking - %s", e$message)
  })
} else {
  log_message("renv: not installed")
}

# Check for errors and warnings - use safer access with explicit checks
errors <- purrr::keep(results, ~ is.list(.) && "error" %in% names(.) && isTRUE(.$error))
warnings_list <- purrr::keep(results, ~ is.list(.) && "warnings" %in% names(.) && length(.$warnings) > 0)

if (length(errors) > 0) {
  log_message("")
  log_message("[ERROR] ERRORS ENCOUNTERED:")
  for (i in seq_along(errors)) {
    error_info <- errors[[i]]
    log_message("  %s: %s", error_info$script, error_info$message)
  }
  log_message("")
  log_message("Check log file for details: %s", log_file)
} else if (length(warnings_list) > 0) {
  log_message("")
  log_message("[WARN] WARNINGS ENCOUNTERED:")
  total_warnings <- 0
  for (i in seq_along(warnings_list)) {
    warning_info <- warnings_list[[i]]
    warning_count <- length(warning_info$warnings)
    total_warnings <- total_warnings + warning_count
    script_name <- names(warnings_list)[i]
    log_message("  %s: %d warnings", script_name, warning_count)
  }
  log_message("")
  log_message("Total warnings: %d", total_warnings)
  log_message("Check log file for warning details: %s", log_file)
  log_message("[OK] All scripts completed with warnings")
} else {
  log_message("[OK] All scripts completed successfully!")
}

log_message("")
log_message("=== KEY OUTPUTS ===")
log_message("Main dataset: %s", here("data", "output", "statements_analysis.csv"))
if (!SKIP_ROBUSTNESS_CHECKS) {
  log_message("Robustness datasets: %s", here("data", "output", "statements_analysis_*.csv"))
  log_message("Robustness comparison: %s", here("data", "output", "robustness_comparison.csv"))
}
log_message("Tables: %s", here("output", "pnas", "tables"))
log_message("Figures: %s", here("output", "pnas", "figures"))
log_message("Intermediate data: %s", here("data", "inter"))
log_message("")
log_message("Log file: %s", log_file)
log_message("")
if (!SKIP_ROBUSTNESS_CHECKS) {
  log_message("Analysis complete with robustness checks! Check output directories for results.")
} else {
  log_message("Analysis complete! Check output directories for results.")
  log_message("Note: Robustness checks were skipped. To run them:")
  log_message("  options(statements.skip_robustness = FALSE)")
  log_message("  source('analysis/statements/run_statements_analysis.R')")
}